# Load required packages and data
library(tidyverse)
library(pander)
library(fitdistrplus)
library(VGAM)
library(AER)
dat <- readRDS("t_test_data_sbs_pt01_10sub_10clust_10000sims.rds")

Data-generating mechanism

Gathering data on how a ‘naive’ NLME model estimates those parameters, as well as what equal- and unequal-variance t-tests show.

What else is going on when we get the tiny \(\sigma_b^2\) estimates?

# List of items in the 'dat' set:
pval_model <- unlist(dat[1,])
est_sb2 <- unlist(dat[2,])
resid_var_model <- unlist(dat[3,])
# [4] modelNLME <- unlist(dat[4,])
clustvar0 <- unlist(dat[5,])
clustvar1 <- unlist(dat[6,])
f_test_dat <- unlist(dat[7,])
# [8] ttest_eq_var <- unlist(dat[8,])
# [9] ttest_uneq_var <- unlist(dat[9,])
# [10] clustmeans <- unlist(dat[10,])
pval_tt_eq_var <- unlist(dat[11,])
pval_tt_uneq_var <- unlist(dat[12,])
df_tt_uneq_var <- unlist(dat[13,])
# Set the true value
sb2 <- .01

# tiny_est identifies the data sets that produced very small $\sigma_b^2$ estimates.
tiny_est <- est_sb2 < 1e-6 # let's assume anything less than 1e-6 are the weird ones


# Are the 

var_dat <- cbind.data.frame(clustvar0, clustvar1, est_sb2, meanclustvar = (clustvar0+clustvar1)/2) %>% gather(key = "type", value = "variance", 1:4)
ggplot(data = var_dat) +
  geom_histogram(aes(x = variance, y = ..density..), color = "black", fill = "grey") +
  geom_vline(aes(xintercept = sb2)) +
  facet_grid(rows = vars(type), labeller = label_both) +
  labs(title = "WTF?")

f0 <- fitdist(clustvar0*(20/sb2), distr = "chisq", start = list(df = 1))
f1 <- fitdist(clustvar0, distr = "norm")

ggplot(data = as.data.frame(clustvar0)) +
  geom_histogram(aes(x = clustvar0*(20/sb2), y = ..density..), color = "black", fill = "grey") +
  stat_function(fun = dchisq, args = list(df = f0$estimate), color = "red") +
  labs(title = "cluster mean variances don't follow a chi-squared distribution")

ggplot(data = as.data.frame(clustvar0)) +
  geom_histogram(aes(x = clustvar0, y = ..density..), color = "black", fill = "grey") +
  stat_function(fun = dnorm, args = list(mean = f1$estimate[1], sd = f1$estimate[2]), color = "red") +
  labs(title = "cluster mean variances are sort of normal-ish, but still too large")

# Let's look at it a different way:
ggplot(data = cbind.data.frame(pval_tt_eq_var, pval_tt_uneq_var, tiny_est)) +
  geom_point(aes(x = pval_tt_eq_var, y = pval_tt_uneq_var, color = tiny_est),alpha = .2) +
  labs(title = "p-values from equal/unequal variance t tests are basically the same")

ggplot(data = cbind.data.frame(pval_tt_uneq_var, tiny_est)) +
  geom_histogram(aes(x = pval_tt_uneq_var), color = "black", fill = "grey") + facet_grid(rows = vars(tiny_est), labeller = label_both) + xlab("p-value for welch t-test, allows unequal variances") + labs(title = "p-v")

ggplot(data = cbind.data.frame(f_test_dat, pval_tt_uneq_var, tiny_est)) +
  geom_point(aes(x = f_test_dat, y = pval_tt_uneq_var, color = tiny_est),alpha = .5) +
  labs(title = "t test p-values seem uncorrelated with F test p-values of differing variances by arm")

ggplot(data = cbind.data.frame(f_test_dat, tiny_est)) +
  geom_histogram(aes(x = f_test_dat), color = "black", fill = "grey") + facet_grid(rows = vars(tiny_est), labeller = label_both) + xlab("p-value for F test of different cluster-mean variances by arm") + labs(title = "Different arm variances don't seem more common when the model estimate is tiny")

ggplot(data = cbind.data.frame(est_sb2, clustvar0, clustvar1, tiny_est, arm_var_diff = abs(clustvar0 - clustvar1))) +
  geom_point(aes(x = clustvar0, y = clustvar1, color = est_sb2), alpha = .3) +
  geom_abline(slope = 1, intercept = 0) +
  facet_grid(cols = vars(tiny_est), labeller = label_both)

Trying A Tobit Fit

tdat <- cbind.data.frame(est_sb2)
tmod <- AER::tobit(est_sb2 ~ 1, left = 1e-5, data = tdat)
summary(tmod)
## 
## Call:
## AER::tobit(formula = est_sb2 ~ 1, left = 1e-05, data = tdat)
## 
## Observations:
##          Total  Left-censored     Uncensored Right-censored 
##          10000           4362           5638              0 
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.0057903  0.0005095   11.37   <2e-16 ***
## Log(scale)  -3.1260301  0.0103109 -303.18   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Scale: 0.04389 
## 
## Gaussian distribution
## Number of Newton-Raphson Iterations: 5 
## Log-likelihood:  6372 on 2 Df
ggplot(data = as.data.frame(est_sb2)) +
  geom_histogram(aes(x = est_sb2, y = ..density..), color = "black", fill = "grey", binwidth = .006) +
  geom_vline(aes(xintercept = summary(tmod)$coefficients[1,1]), linetype = "dotdash", color = "red") +
  geom_vline(aes(xintercept = sb2), color = "black") +
  stat_function(fun = dnorm, args = list(mean = summary(tmod)$coefficients[1,1], sd = summary(tmod)$scale), color = "red") +
  labs(title = "Tobit looks probable; note that mean of Tobit is lower than true value")

ggplot(data = as.data.frame(est_sb2)) +
  geom_histogram(aes(x = est_sb2, y = ..density..), color = "black", fill = "grey", binwidth = .003) +
  geom_vline(aes(xintercept = summary(tmod)$coefficients[1,1]), linetype = "dotdash", color = "red") +
  geom_vline(aes(xintercept = sb2), color = "black") +
  stat_function(fun = dnorm, args = list(mean = summary(tmod)$coefficients[1,1], sd = summary(tmod)$scale), color = "red") +
  ylim(c(0, 15))+
  labs(title = "Same graph, focusing on fit of non-zero parts")
## Warning: Removed 1 rows containing missing values (geom_bar).